df <- read.table("emissionssw.dat", header=TRUE)
head(df)
##      nox     noxem     ws  humidity
## 1 170.25 1580.1338 0.9000 0.3195116
## 2 314.40 3248.0421 1.0500 0.4041845
## 3 270.15 3207.5551 0.9665 0.3715211
## 4 177.65 2798.4168 1.0335 0.3065146
## 5 168.00 3181.8449 2.0165 0.2604632
## 6  75.10  270.7519 1.5170 0.2102720

Quite a few outliers across all time series. Consider quantile clipping to prevent influence of these observations? Maybe even remove from data and don’t bother trying to predict

plot.ts(df)

lapply(names(df), function(col) {
  acf(df[[col]], main = paste("ACF for", col))
})

## [[1]]
## 
## Autocorrelations of series 'df[[col]]', by lag
## 
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
## 1.000 0.448 0.230 0.199 0.207 0.238 0.184 0.143 0.124 0.149 0.122 0.068 0.046 
##    13    14    15    16    17    18    19    20    21    22    23    24    25 
## 0.020 0.031 0.060 0.035 0.046 0.076 0.045 0.035 0.052 0.071 0.067 0.081 0.053 
##    26    27    28    29    30    31    32    33 
## 0.034 0.030 0.044 0.062 0.053 0.063 0.072 0.104 
## 
## [[2]]
## 
## Autocorrelations of series 'df[[col]]', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000  0.496  0.043 -0.145 -0.109  0.040  0.122  0.135  0.076  0.015 -0.039 
##     11     12     13     14     15     16     17     18     19     20     21 
## -0.054 -0.039 -0.026  0.020  0.033  0.012 -0.013 -0.037 -0.053 -0.038 -0.005 
##     22     23     24     25     26     27     28     29     30     31     32 
##  0.021 -0.001 -0.040 -0.051 -0.034 -0.027 -0.010  0.004  0.012  0.000  0.002 
##     33 
##  0.039 
## 
## [[3]]
## 
## Autocorrelations of series 'df[[col]]', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000  0.616  0.385  0.273  0.223  0.202  0.159  0.143  0.092  0.058  0.033 
##     11     12     13     14     15     16     17     18     19     20     21 
##  0.009  0.018  0.003  0.010 -0.003 -0.015 -0.002 -0.007 -0.013 -0.016 -0.011 
##     22     23     24     25     26     27     28     29     30     31     32 
## -0.006 -0.007  0.010  0.042  0.046  0.017  0.001  0.023  0.031  0.054  0.082 
##     33 
##  0.115 
## 
## [[4]]
## 
## Autocorrelations of series 'df[[col]]', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000  0.751  0.571  0.443  0.359  0.260  0.189  0.137  0.086  0.052  0.024 
##     11     12     13     14     15     16     17     18     19     20     21 
##  0.012  0.003 -0.005 -0.007 -0.006 -0.012 -0.012 -0.007 -0.006 -0.014 -0.018 
##     22     23     24     25     26     27     28     29     30     31     32 
## -0.015 -0.012 -0.003  0.009  0.025  0.009 -0.002 -0.017 -0.023 -0.033 -0.035 
##     33 
## -0.032
plot(df$nox[1:6])

acf(log(df$nox))

summary(df)
##       nox            noxem              ws            humidity       
##  Min.   :  1.2   Min.   : 102.5   Min.   : 0.100   Min.   :0.009332  
##  1st Qu.: 47.7   1st Qu.: 675.0   1st Qu.: 1.033   1st Qu.:0.060673  
##  Median : 88.3   Median :2183.6   Median : 1.667   Median :0.100842  
##  Mean   :116.0   Mean   :2250.7   Mean   : 2.087   Mean   :0.171872  
##  3rd Qu.:152.5   3rd Qu.:3741.7   3rd Qu.: 2.750   3rd Qu.:0.168508  
##  Max.   :694.5   Max.   :5362.1   Max.   :11.367   Max.   :4.099019

lag 1/5 have the highest autocorrelations, with the rest being lower. Makes sense given martingale property + same time of day at lag-5 mark. Since lag 1 has such high autocorrelation, may be worth trying a model fitted on first order differences.

pairs(df)

m1 <- lm(nox ~ ., data=df)
summary(m1)
## 
## Call:
## lm(formula = nox ~ ., data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -166.19  -42.65  -15.38   20.03  500.14 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  92.947559   3.699776  25.122   <2e-16 ***
## noxem         0.036152   0.001077  33.573   <2e-16 ***
## ws          -27.338040   1.113436 -24.553   <2e-16 ***
## humidity     -7.227542   5.705300  -1.267    0.205    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 73.89 on 2018 degrees of freedom
## Multiple R-squared:  0.437,  Adjusted R-squared:  0.4361 
## F-statistic: 522.1 on 3 and 2018 DF,  p-value: < 2.2e-16
plot(m1)

m1 <- lm(nox ~ ., data=df, subset=-c(156, 160, 161))
summary(m1)
## 
## Call:
## lm(formula = nox ~ ., data = df, subset = -c(156, 160, 161))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -164.40  -42.27  -14.96   19.79  339.84 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  92.658462   3.613842  25.640   <2e-16 ***
## noxem         0.035642   0.001053  33.842   <2e-16 ***
## ws          -26.982235   1.088116 -24.797   <2e-16 ***
## humidity     -6.753458   5.572420  -1.212    0.226    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 72.16 on 2015 degrees of freedom
## Multiple R-squared:  0.4412, Adjusted R-squared:  0.4404 
## F-statistic: 530.4 on 3 and 2015 DF,  p-value: < 2.2e-16
plot(m1)

looks like a quadratic u shaped relationship between residuals and fitted. Let’s try squaring humidity since that was insignificant and looked nonlinear

m2 <- lm(nox ~. + I(humidity^2), data=df)
summary(m2)
## 
## Call:
## lm(formula = nox ~ . + I(humidity^2), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -166.23  -42.65  -15.38   20.04  500.16 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    92.892893   3.898850  23.826   <2e-16 ***
## noxem           0.036153   0.001077  33.560   <2e-16 ***
## ws            -27.340544   1.115129 -24.518   <2e-16 ***
## humidity       -6.758165  11.982859  -0.564    0.573    
## I(humidity^2)  -0.202849   4.553608  -0.045    0.964    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 73.9 on 2017 degrees of freedom
## Multiple R-squared:  0.437,  Adjusted R-squared:  0.4359 
## F-statistic: 391.4 on 4 and 2017 DF,  p-value: < 2.2e-16
plot(m2)

Didn’t help, what about windspeed since that was nonlinear too. Still seeing a larger fitted value gives larger variance. Polynomials degree two don’t seem to work, though there is some improvement. Common theme is that humidity doesn’t seem to be important.

# try quantile clipping on data to deal with outliers. Park idea for now
# library(raster)
# dfclipped = clip.data(select(df, c('noxem', 'ws')), lower=0.01, upper=0.99)
# print(dfclipped)
# 
# logging the response gives a good residuals vs fitted plot
# Attempt to fit WLS
# add convergence criterion
max_iter = 100
m3w <- lm(log(nox) ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), data=df)
for (i in 1:max_iter) {
  m3w <- lm(log(nox) ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), data=df, weights = 1/abs(resid(m3w)))
}
summary(m3w)
## 
## Call:
## lm(formula = log(nox) ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), 
##     data = df, weights = 1/abs(resid(m3w)))
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.39062 -0.58394 -0.08857  0.56738  1.27014 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.203e+00  1.201e-02  349.94   <2e-16 ***
## noxem        7.555e-04  1.138e-05   66.42   <2e-16 ***
## ws          -5.606e-01  9.969e-03  -56.23   <2e-16 ***
## I(ws^2)      2.248e-02  1.019e-03   22.07   <2e-16 ***
## I(noxem^2)  -9.375e-08  2.369e-09  -39.57   <2e-16 ***
## noxem:ws     4.312e-05  1.794e-06   24.04   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6549 on 2016 degrees of freedom
## Multiple R-squared:  0.9953, Adjusted R-squared:  0.9953 
## F-statistic: 8.629e+04 on 5 and 2016 DF,  p-value: < 2.2e-16
plot(m3w)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

library(MASS)
mlog <- lm(log(nox) ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), data=df)
summary(mlog)
## 
## Call:
## lm(formula = log(nox) ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.90335 -0.32465  0.00147  0.34405  1.61911 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.180e+00  4.915e-02  85.040  < 2e-16 ***
## noxem        7.898e-04  3.471e-05  22.754  < 2e-16 ***
## ws          -5.674e-01  2.753e-02 -20.609  < 2e-16 ***
## I(ws^2)      2.176e-02  3.081e-03   7.061 2.26e-12 ***
## I(noxem^2)  -9.891e-08  6.868e-09 -14.403  < 2e-16 ***
## noxem:ws     4.090e-05  5.608e-06   7.293 4.35e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5506 on 2016 degrees of freedom
## Multiple R-squared:  0.6619, Adjusted R-squared:  0.661 
## F-statistic: 789.2 on 5 and 2016 DF,  p-value: < 2.2e-16
plot(mlog)

mlog_robust <- rlm(log(nox) ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), data=df)
summary(mlog_robust)
## 
## Call: rlm(formula = log(nox) ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), 
##     data = df)
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.9202174 -0.3262980  0.0002274  0.3389421  1.6371054 
## 
## Coefficients:
##             Value    Std. Error t value 
## (Intercept)   4.2138   0.0490    85.9433
## noxem         0.0007   0.0000    21.4065
## ws           -0.5678   0.0275   -20.6762
## I(ws^2)       0.0228   0.0031     7.4077
## I(noxem^2)    0.0000   0.0000   -12.9091
## noxem:ws      0.0000   0.0000     7.0525
## 
## Residual standard error: 0.4964 on 2016 degrees of freedom
plot(mlog_robust)

More insights from question: - measurements sequential over one year, sorted by day,time. Need to take this dependence into account - hypothesis: noxem/ws are important, noxem is the source, ws is the dispersion. Humidity may have an impact, but so far it’s not showing.

General: - train/test set, CV? for model selection/evaluating performance while checking for overfitting

Dealing with time dependence: - adding lagged/smoothed/rolling mean versions of variables (can try lag 1 given high autocorr) - adding time index/seasonal predictor? (low priority)

Tried first order differencing, not really any that much better than the striaght linear model it seems. Still the same U-shaped residual problem.

df["dy"] <- c(NaN, diff(df$nox))
# m4 <- lm(dy ~ noxem + ws + humidity, data=df)
m4 <- lm(dy ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), data=df)
summary(m4)
## 
## Call:
## lm(formula = dy ~ (noxem + ws)^2 + I(ws^2) + I(noxem^2), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -524.92  -39.39    9.04   43.61  458.49 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.311e+01  8.618e+00  -3.842 0.000126 ***
## noxem        5.190e-02  6.086e-03   8.528  < 2e-16 ***
## ws          -1.914e+01  4.827e+00  -3.966 7.58e-05 ***
## I(ws^2)      2.683e+00  5.402e-01   4.967 7.38e-07 ***
## I(noxem^2)  -4.662e-06  1.204e-06  -3.872 0.000112 ***
## noxem:ws    -5.392e-03  9.832e-04  -5.485 4.67e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 96.52 on 2015 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1304, Adjusted R-squared:  0.1282 
## F-statistic: 60.41 on 5 and 2015 DF,  p-value: < 2.2e-16
plot(m4)